%matplotlib inline
#The magic above allows plots to display inline in the notebook.
#python includes
import sys
#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
#Method to load data
def getConllTags(filename):
    """Read a CoNLL-style part-of-speech file into per-sentence (word, tag) lists.

    Each non-blank line must hold exactly one word and one tag separated by a
    tab; a blank (whitespace-only) line marks a sentence boundary.

    input: filename -- path to the tagged file, read as UTF-8
    output: a list of sentences, each a list of (word, tag) tuples, e.g.
            [[(word1, tag1), (word2, tag2)], [(word3, tag3)]]
            Every blank line starts a new (possibly empty) sentence, so
            consecutive or trailing blank lines yield empty lists.
    raises: ValueError if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    sentences = [[]]
    with open(filename, encoding='utf8') as conllFile:
        for rawLine in conllFile:
            line = rawLine.strip()
            if not line:
                # Blank line: close the current sentence and open a new one.
                sentences.append([])
                continue
            word, tag = line.split("\t")
            sentences[-1].append((word, tag))
    return sentences
# Path to the CoNLL-style tagged corpus; it is a relative path, so it is
# resolved against the current working directory.
corpus = 'daily547.conll'
# Load the corpus as a list of sentences, each a list of (word, tag) tuples.
taggedSents = getConllTags(corpus)
# Sanity check: show the first two sentences that were read.
print(taggedSents[:2])
from pprint import pprint
# Running tallies of lower-cased n-grams. Every key is a tuple of words,
# including the 1-tuples used for single words, so all three tables can be
# indexed the same way.
wordCounts = dict()
bigramCounts = dict()
trigramCounts = dict()
numTrainingSents = 500

# Only the first numTrainingSents sentences contribute to the counts.
for sent in taggedSents[:numTrainingSents]:
    tokens = [w.lower() for w, _ in sent]  # keep the words, drop the tags
    for pos, current in enumerate(tokens):
        unigram = (current,)
        wordCounts[unigram] = wordCounts.get(unigram, 0) + 1
        # A bigram needs one preceding word in the same sentence.
        if pos >= 1:
            pair = (tokens[pos - 1], current)
            bigramCounts[pair] = bigramCounts.get(pair, 0) + 1
        # A trigram needs two preceding words in the same sentence.
        if pos >= 2:
            triple = (tokens[pos - 2], tokens[pos - 1], current)
            trigramCounts[triple] = trigramCounts.get(triple, 0) + 1

# Show the 20 most frequent entries of each table, highest count first.
for counts in (wordCounts, bigramCounts, trigramCounts):
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    pprint(ranked[:20])
#specify the model (e.g. bigramCounts or trigramCounts)
ngramCounts = trigramCounts
#ngramCounts = bigramCounts
# The probability of an n-gram is its count divided by the count of its
# context (all words but the last). The context table depends on the model
# order: 1-word contexts are counted in wordCounts, 2-word contexts in
# bigramCounts. Selecting by context length keeps the denominator correct
# when the model above is switched to bigramCounts.
contextCountsByLen = {1: wordCounts, 2: bigramCounts}
ngramModelProbs = dict()  # stores p(xi | context), indexed by [context][xi]
for ngram, count in ngramCounts.items():
    context, nextWord = ngram[:-1], ngram[-1]
    p = count / contextCountsByLen[len(context)][context]
    # Create the inner dict the first time a context is seen.
    ngramModelProbs.setdefault(context, {})[nextWord] = p
# Show probabilities for all words that could follow "i love"; this raises
# KeyError if that context never occurs in the selected model.
pprint(sorted(ngramModelProbs[('i','love')].items()))
#pprint(sorted(ngramModelProbs[('i',)].items()))#show probabilities for all words that could follow i
#if time. generate a sentence
#saved code in case we want to do one-hot representation
# Accumulate the distinct words and tags seen anywhere in the corpus. The sets
# must exist before the in-place unions below, otherwise the first `|=`
# raises NameError.
wordVocab = set()
tagVocab = set()
for sent in taggedSents:
    if sent:  # an empty sentence gives zip(*sent) nothing to unpack
        words, tags = zip(*sent)
        wordVocab |= set(words)  # union of the words into the set
        tagVocab |= set(tags)  # union of all the tags into the set
print("[Read ", len(taggedSents), " Sentences]")
#make dictionaries for converting words to index and tags to ids:
# Sorting fixes the index assignment; plain set iteration order for strings
# can differ from one interpreter run to the next.
wordToIndex = {w: i for i, w in enumerate(sorted(wordVocab))}
numToTag = sorted(tagVocab)  # mapping index to tag
tagToNum = {tag: i for i, tag in enumerate(numToTag)}